Using the data collected from existing customers, build a model that will help the marketing team identify potential customers who are relatively more likely to subscribe to a term deposit, and thus increase their hit ratio. Resources available: the historical data for this project is described at https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
Input variables: Bank client data:
# Core data-handling and statistics libraries.
import os,sys
import pandas as pd
import numpy as np
from scipy import stats
# importing plotting libraries
import matplotlib.pyplot as plt
#importing seaborn for statistical plots
import seaborn as sns
# HACK: disables seaborn's statsmodels integration through a private attribute;
# fragile across seaborn versions -- confirm it is still required.
sns.distributions._has_statsmodels=False
# To enable plotting graphs in Jupyter notebook
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# calculate accuracy measures and confusion matrix
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, classification_report, confusion_matrix,auc
from sklearn.metrics import recall_score,precision_score, f1_score,accuracy_score
# NOTE(review): the next two imports duplicate lines above (harmless but redundant).
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Set color code,font scale.. (default figure size of 35x35in is very large)
sns.set(color_codes=True,rc={'figure.figsize':(35.0,35.0)},font_scale=1)
%matplotlib inline
# Load the full bank-marketing dataset and take a first look at it.
dataframe = pd.read_csv("bank-full.csv")
# Class balance of the target ('yes'/'no' term-deposit subscription).
dataframe['Target'].value_counts()
dataframe
dataframe.shape
#Unique Values
display(dataframe.nunique())
dataframe.info()
# Null-value check (info() already reports non-null counts per column).
print(dataframe.info())
#SHAPE
print(dataframe.shape)
#Describe-FIVE POINTS SUMMARY
print(dataframe.describe())
# NOTE(review): the three checks below are redundant with info() above.
print(dataframe.isnull().any())
print(dataframe.isnull().sum().sum())
display(dataframe.isnull().sum())
There are 7 integer(Numeric) type attributes
Target: Means , has the client subscribed a term deposit? ( 'yes','no')
# Transposed five-number summary of the numeric columns (easier to scan by row).
dataframe.describe().T
Insights From Descriptive:
Age
Balance:
#Creating Profile Report for Analysis
#!pip install pandas_profiling
# pandas_profiling (renamed 'ydata-profiling' in newer releases) renders a
# full HTML EDA report inline in the notebook.
import pandas_profiling
dataframe.profile_report()
previous is highly skewed (γ1 = 41.84645447) Skewed
balance has 3514 (7.8%) zeros Zeros
previous has 36954 (81.7%) zeros Zeros
# Overall class balance (pie) and education/balance breakdown (bar) side by side.
df=dataframe
f, ax = plt.subplots(1,2, figsize=(16,8))
colors = ["#FB7858", "#34FE2E"]
labels ="Did not subscribed a term deposit", "subscribed a term deposit"
plt.suptitle('Information on Term Deposit', fontsize=20)
df["Target"].value_counts().plot.pie(explode=[0,0.25], autopct='%1.2f%%', ax=ax[0], shadow=True, colors=colors,
labels=labels, fontsize=12, startangle=5)
ax[0].set_ylabel('% of Condition of Loans', fontsize=14)
palette = ["#65FE2E", "#FA5858"]
# BUG FIX: the bar plot was previously drawn on the current (pie) axes because
# ax=ax[1] was missing; also dropped the manual set_xticklabels call, which
# re-labelled ticks with unique() values whose order need not match the
# categories seaborn actually plotted.
sns.barplot(x="education", y="balance", hue="Target", data=df, palette=palette, estimator=lambda x: len(x) / len(df) * 100, ax=ax[1])
ax[1].set(ylabel="(%)")
plt.show()
import matplotlib.pyplot as plt
# Histograms of every numeric column for a quick distribution overview.
df.hist(bins=20, figsize=(14,10), color='green')
plt.show()
As we can see from the descriptive stats above, there are no missing values. However, there are values like "other" that do not add any significance; those rows are removed from the dataset.
# Frequency of each previous-campaign outcome.
dataframe['poutcome'].value_counts()
# Step 1: Delete the rows in the 'poutcome' column whose value is 'other',
# as that category does not help with any inference.
delcondition = dataframe.poutcome == 'other'
# Drops the matching rows in place; this permanently shrinks the dataset.
dataframe.drop(dataframe[delcondition].index, axis = 0, inplace = True)
print("dataframe is :", dataframe['poutcome'].value_counts())
# Keep 'df' pointing at the same (now filtered) frame.
df=dataframe
dataframe.shape
Let's Convert duration attributes from seconds to minute to have better measure and analysis
# Work on a copy so the filtered frame stays untouched.
newdf = dataframe.copy()
# Convert call duration from seconds to minutes, rounded to 2 decimal places.
newdf['duration'] = newdf['duration'].apply(lambda n:n/60).round(2)
# NOTE(review): 'seaborn-whitegrid' was renamed 'seaborn-v0_8-whitegrid' in
# newer matplotlib releases -- confirm against the installed version.
plt.style.use('seaborn-whitegrid')
newdf.hist(bins=20, figsize=(15,10), color='green', edgecolor = 'black')
plt.show()
#print("Descriptive stats of age",dataframe1['age'].describe())
print("stats of duration",newdf['duration'].describe())
print("stats of campaign",newdf['campaign'].describe())
print("stats of day", newdf['day'].describe())
print("stats of no of day past the campaign was last done: ", newdf['pdays'].describe())
# Clients with an overdrawn (negative) balance.
newdf[newdf['balance']<0]
As observed from the histogram plot, the duration of contact has a median of 3 minutes, with an interquartile range of 1.73 minutes to 5.3 minutes. The right-skewed distribution indicates that most calls are relatively short. Also, there is a large number of outliers.
The distribution of campaign: about half of the clients have been contacted by the bank for the second time, while 25% were being introduced to the term deposit for the first time. Most clients have been reached by the bank one to three times, which is reasonable. However, some clients have been contacted as many as 63 times, which is not normal. These clients may have some special needs that require frequent contact.
# Box plots of age, balance and day-of-month to eyeball outliers.
plt.figure(figsize= (20,15))
plt.subplot(3,3,1)
sns.boxplot(x= newdf.age, color='green')
plt.subplot(3,3,2)
sns.boxplot(x= newdf.balance, color='green')
plt.subplot(3,3,3)
sns.boxplot(x= newdf.day, color='green')
plt.show()
# Box plots of the campaign-related attributes.
plt.figure(figsize= (20,15))
plt.subplot(4,4,1)
sns.boxplot(x= newdf.duration, color='green')
plt.subplot(4,4,2)
sns.boxplot(x= newdf.campaign, color='green')
plt.subplot(4,4,3)
sns.boxplot(x= newdf.pdays, color='green')
plt.subplot(4,4,4)
sns.boxplot(x= newdf.previous, color='green')
plt.show()
There seems to be an outlier in age, between the age group 70-90.
Most customers fall in the age range of 32-48. The median age is very close to 40, which suggests the bank targeted this group expecting them to be more inclined to take the FD option.
The balance attribute shows a high level of skewness, which we also observed in the histogram analysis. There are some extreme outliers between 55k and 100k euros, and a long tail towards the right, so it is highly skewed.
The balance attribute needs to be treated to remove the outliers, which we will deal with next.
The day attribute seems to be approximately normally distributed with no significant outliers as such.
The rest of the numerical attributes do not seem significant and are highly skewed with long right tails, especially pdays and previous, so there is little to find here statistically.
#SKEWNESS
from scipy.stats import zscore
import scipy.stats as stats
# Skewness (third standardized moment) of every numeric attribute, computed
# with a single loop rather than seven hand-written calls.
numeric_cols = ['age', 'day', 'balance', 'duration', 'campaign', 'pdays', 'previous']
Skewness = pd.DataFrame(
    {'Skewness': [stats.skew(dataframe[col]) for col in numeric_cols]},
    index=numeric_cols)
Skewness
#Removing outliers in balance data using zscore:
from scipy.stats import zscore
# NOTE(review): the next line is executed twice -- the duplicate is redundant.
newdf[['balance']].mean()
newdf[['balance']].mean()
# Standardize balance, then drop rows more than 3 standard deviations away.
newdf['balance_outliers'] = newdf['balance']
newdf['balance_outliers']= zscore(newdf['balance_outliers'])
condition1 = (newdf['balance_outliers']>3) | (newdf['balance_outliers']<-3 )
newdf1 = newdf.drop(newdf[condition1].index, axis = 0, inplace = False)
# Discard the helper column; newdf2 is the outlier-treated frame used below.
newdf2 = newdf1.drop('balance_outliers', axis=1)
#original one (before outlier treatment)
plt.figure(figsize= (20,15))
sns.boxplot(x= newdf.balance, color='lightblue')
#After outlier treatment using z score
plt.figure(figsize= (20,15))
sns.boxplot(x= newdf2.balance, color='lightblue')
print("We managed to get rid to some extreme outlier shown below. ")
# From here on, newdf is an alias for the outlier-treated frame.
newdf=newdf2
# Categorical (object-dtype) columns for the category-wise EDA below.
objdf = newdf.select_dtypes(include ='object')
objdf.head(5)
objdf.columns
## category columns
# Print the value counts of every categorical column, then draw a grid of
# count plots. NOTE(review): counts are taken from 'df' (the pre-outlier
# frame), not 'newdf' -- confirm that is intentional.
category_variables = objdf.columns
for i in category_variables:
    display(i)
    display(df[i].value_counts(normalize=False))
fig=plt.figure(figsize=(50,20))
for i,col in enumerate(category_variables):
    ax=fig.add_subplot(5,5,i+1)
    sns.countplot(df[col])
Job Type Distribution:
blue-collar 9331
management 9065
technician 7298
admin. 4930
services 3977
retired 2181
self-employed 1518
entrepreneur 1448
unemployed 1266
housemaid 1214
student 857
unknown 286
Job:
Marital Status:
married 26227
single 12160
divorced 4984
EDUCATION:
Education Level Distribution:
secondary 22215
tertiary 12757
primary 6607
unknown 1792
It seems that around 51% of the targeted customers have a secondary level of education and around 29% have qualifications beyond 12th class. Targeting customers of whom 51% have only secondary education is somewhat not as per expectation, and the campaign may not bear fruitful results; we need to rethink the target audience. Our focus should be more on working professionals whose qualification is beyond secondary level.
We also found some customers whose level of education is unknown; these need to be ignored or resampled.
Around 857 customers are still studying and should not be our target audience, as they are much less likely to take the FD option. We will verify this going forward.
Credit Default:
Distribution: no 42572
yes 799
- It is good to see that almost 98% of the targeted customers don't default.
- Only 799 are defaulters and not creditworthy. These people need to be ignored when targeting for FD conversion.
no 36392
yes 6979
- 83 % customers have no personal loan lying with them only 17% people who were targeted have personal loan availed.
# One bar chart per remaining categorical attribute in a 4x2 grid.
# FIX: the original rebuilt value_counts() once per category inside a list
# comprehension; computing the Series once gives identical bars without the
# repeated O(n) passes.
plt.figure(figsize=(20,25))
#Mode of communication with customers
contact_counts = newdf.contact.value_counts()
print("\nDistribution Of Mode Of Communication With Customers: ", newdf.contact.value_counts())
plt.subplot(4,2,1)
plt.bar(contact_counts.index, contact_counts.values, align='center',color = 'green',edgecolor = 'black',alpha = 0.7) #plot a bar chart
plt.xlabel('Contact Type ?')
plt.ylabel('Count ')
plt.title("Contact Type Distribution")
#communication result
poutcome_counts = newdf.poutcome.value_counts()
print("\nDistribution Of communication result: ", newdf.poutcome.value_counts())
plt.subplot(4,2,2)
plt.bar(poutcome_counts.index, poutcome_counts.values, align='center',color = 'green',edgecolor = 'black',alpha = 0.7) #plot a bar chart
plt.xlabel('Phone Call Outcome ?')
plt.ylabel('Count ')
plt.title("Phone Call Outcome Distribution")
#month when customer was last contacted
month_counts = newdf.month.value_counts()
print("\nDistribution Of monthly customer contact detail : ", newdf.month.value_counts())
plt.subplot(4,2,3)
plt.bar(month_counts.index, month_counts.values, align='center',color = 'green',edgecolor = 'black',alpha = 0.7) #plot a bar chart
plt.xlabel('Month Contacted ?')
plt.ylabel('Count ')
plt.title("Month Contacted Distribution")
#FD status Which is our target variable
target_counts = newdf.Target.value_counts()
print("\nDistribution Of customer W.R.T FD : ", newdf.Target.value_counts())
plt.subplot(4,2,4)
plt.bar(target_counts.index, target_counts.values, align='center',color = 'green',edgecolor = 'black',alpha = 0.7) #plot a bar chart
plt.xlabel('FD Status ?')
plt.ylabel('Count ')
plt.title(" FD status Distribution")
plt.show()
Quick Observation On Above Category Plot :
cellular 27168
unknown 12820
telephone 2667
- Around 64% of potential customers were contacted by mobile phone. It will be interesting to see the call duration for these, as it impacts the conversion rate to a large extent.
- For almost 28% of people the mode of communication was not captured and is categorized as unknown. These records don't seem to add value, as we don't know how to infer the outcome here.
- Landline, as expected, has a very low share as a mode of client communication.
unknown 36358
failure 4827
success 1470
may 13048
jul 6794
aug 6050
jun 5163
nov 3599
apr 2627
feb 2354
jan 1252
oct 651
sep 503
mar 432
dec 182
May has the highest frequency (13048) of bank contacts with potential customers for the FD campaign. This may be due to the previous financial year's closing and the drive to add new customers for the FD instrument, to increase bank revenue.
June, July and August also have a fair share, but not as much as May.
Target(FD) Variable :
no 37785
yes 4870
- Most of the contacted customers seem to be not interested in FD.
We will make use of count plots with the Target variable as hue.
# Bin ages into decade groups. Labels now match the later age-group analysis:
# <30 -> 20, 30-39 -> 30, 40-49 -> 40, 50-59 -> 50, 60+ -> 60.
# BUG FIX: the original skipped the 40-49 bin entirely (those rows got a NaN
# age_group, which the later dropna() would silently delete) and mislabelled
# 30-39 as 40.
lst = [newdf2]
for column in lst:
    column.loc[column["age"] < 30, 'age_group'] = 20
    column.loc[(column["age"] >= 30) & (column["age"] <= 39), 'age_group'] = 30
    column.loc[(column["age"] >= 40) & (column["age"] <= 49), 'age_group'] = 40
    column.loc[(column["age"] >= 50) & (column["age"] <= 59), 'age_group'] = 50
    column.loc[column["age"] >= 60, 'age_group'] = 60
# Share of yes/no responses within each age group (column-wise percentages).
count_age_response_pct = pd.crosstab(newdf2['Target'],newdf2['age_group']).apply(lambda x: x/x.sum() * 100)
count_age_response_pct = count_age_response_pct.transpose()
age = pd.DataFrame(newdf2['age_group'].value_counts())
age['% Contacted'] = age['age_group']*100/age['age_group'].sum()
age['% FD Subscription'] = count_age_response_pct['yes']
age.drop('age_group',axis = 1,inplace = True)
# FIX: sort by the group label from the index instead of a hard-coded list
# whose order silently depended on value_counts() ordering.
age['age'] = age.index
age = age.sort_values('age',ascending = True)
plt.figure(figsize=(20,10))
sns.countplot(newdf2['age_group'], hue = "Target", data=newdf2)
plt.tight_layout()
Quick Insights:
- Term Deposit rate is higher among young people between the age of 30-40
# Subscription counts by job type.
plt.figure(figsize=(20,10))
sns.countplot(newdf2.job, hue = "Target", data=newdf2)
plt.tight_layout()
Quick Insights:
- Management professional seems to be more likely to avail FD option here.
- Technician also seems to faring well along with blue collar professionals.
# Subscription counts by marital status.
plt.figure(figsize=(20,10))
sns.countplot(newdf2.marital, hue = "Target", data=newdf2)
plt.tight_layout()
Marital Status Impacts On Term Deposit :
# Subscription counts by education level.
plt.figure(figsize=(20,10))
sns.countplot(newdf2.education, hue = "Target", data=newdf2)
plt.tight_layout()
Quick Insights:
# Subscription counts by personal-loan status.
plt.figure(figsize=(20,10))
sns.countplot(newdf2.loan, hue = "Target", data=newdf2)
plt.tight_layout()
Quick Insights:
# Subscription counts by housing-loan status.
plt.figure(figsize=(20,10))
sns.countplot(newdf2.housing, hue = "Target", data=newdf2)
plt.tight_layout()
Quick Insights:
# Subscription counts by credit-default status.
plt.figure(figsize=(20,10))
sns.countplot(newdf2.default, hue = "Target", data=newdf2)
plt.tight_layout()
Quick Insights:
- As expected customer with no default history are more liley to opt for Term Deposit
# Subscription counts by contact mode (cellular / telephone / unknown).
plt.figure(figsize=(20,10))
sns.countplot(newdf2.contact, hue = "Target", data=newdf2)
plt.tight_layout()
Quick Insights:
- As expected cellualr as a mode of communication is leading the count and also Term Deposit
# Subscription counts by month of last contact.
plt.figure(figsize=(20,10))
sns.countplot(newdf2.month, hue = "Target", data=newdf2)
plt.tight_layout()
Quick Insights:
- May month seems to be best time to conact customer for increasing the rate of Term Deposit
- jun, july, aug also seems to be farring well but march, dec and jan doesn't look idel time to campaign.
# Subscription counts by number of contacts made during this campaign.
plt.figure(figsize=(20,10))
sns.countplot(newdf2.campaign, hue = "Target", data=newdf2)
plt.tight_layout()
If the customer is contacted no more than 5-6 times, they are more likely to opt for a term deposit.
Let's see how bank balance and age are related. We removed some outliers in the balance attribute; let's use that data to see the result.
# Age vs balance scatter on the outlier-treated data.
scatter_age_balance = newdf2.plot.scatter('age','balance',figsize = (15,10))
plt.title('Age and Balance ')
plt.show()
Based on this scatter plot, there seems to be no clear linear relationship between a client's age and balance level.
plt.figure(figsize=(15,10))
# Duration vs campaign scatter; the palette argument has no effect without a
# hue and is harmless.
sns.scatterplot(newdf2.duration, newdf2.campaign, palette= ['green','green'] )
plt.show()
Observation: we can see a trend here — as the frequency of contacting the same client increases, the call duration decreases; a kind of negative correlation. So it is advisable to keep the frequency low and the call duration longer, which may result in a better conversion rate.
Let's see how education impacts the bank balance of the customer
# Education vs balance box plot, then mean/median of numeric columns by Target.
fig = plt.figure(figsize=(40,30))
# BUG FIX: the original also created a second, unused axes (ax2) at the very
# same grid position (221); one axes is all this plot needs.
ax1 = fig.add_subplot(221)
ax1 = sns.boxplot(newdf2['education'], newdf2['balance'], data=newdf2, ax =ax1)
print("------------------")
print("MEAN")
print("------------------")
print(newdf2.groupby('Target').mean())
print("------------------")
print("MEDIAN")
print("------------------")
print(newdf2.groupby('Target').median())
Quick Insights:
# Re-bin ages into decades, then compare % contacted vs % subscribed per group.
lst = [newdf2]
for column in lst:
    column.loc[column["age"] < 30, 'age_group'] = 20
    column.loc[(column["age"] >= 30) & (column["age"] <= 39), 'age_group'] = 30
    column.loc[(column["age"] >= 40) & (column["age"] <= 49), 'age_group'] = 40
    column.loc[(column["age"] >= 50) & (column["age"] <= 59), 'age_group'] = 50
    column.loc[column["age"] >= 60, 'age_group'] = 60
# Share of yes/no within each age group (column-wise percentages).
count_age_response_pct = pd.crosstab(newdf2['Target'],newdf2['age_group']).apply(lambda x: x/x.sum() * 100)
count_age_response_pct = count_age_response_pct.transpose()
print(count_age_response_pct)
age = pd.DataFrame(newdf2['age_group'].value_counts())
age['Contacted'] = age['age_group']*100/age['age_group'].sum()
# FIX: legend-column typo 'Term Depsit' corrected (it appears in the plot).
age['Term Deposit'] = count_age_response_pct['yes']
age.drop('age_group',axis = 1,inplace = True)
# FIX: sort by the group label from the index instead of the hard-coded list
# [30,40,50,20,60], which silently assumed a particular value_counts() order.
age['age'] = age.index
age = age.sort_values('age',ascending = True)
plot_age = age[['Term Deposit','Contacted']].plot(kind = 'bar', figsize=(8,6), color = ('green','orange'))
plt.xlabel('Age Group')
plt.ylabel('Subscription Rate')
plt.xticks(np.arange(5), ('<30', '30-39', '40-49', '50-59', '60+'),rotation = 'horizontal')
plt.title('Subscription vs. Rate by Age')
plt.show()
The orange bars indicate that clients aged 60+ have the highest subscription rate. About 17% of the subscriptions came from clients aged under 30. More than 50% of the subscriptions are contributed by the youngest and the eldest clients.
#Let's Segregate The Balance & Perform Transformation To Have Better Insights
lst = [newdf2]
for column in lst:
    column.loc[column["balance"] <= 0, 'balance_group'] = 'no balance'
    column.loc[(column["balance"] > 0) & (column["balance"] <= 1000), 'balance_group'] = 'low balance'
    column.loc[(column["balance"] > 1000) & (column["balance"] <= 5000), 'balance_group'] = 'average balance'
    column.loc[(column["balance"] > 5000), 'balance_group'] = 'high balance'
# Share of yes/no within each balance group (column-wise percentages).
count_balance_response_pct = pd.crosstab(newdf2['Target'],newdf2['balance_group']).apply(lambda x: x/x.sum() * 100)
count_balance_response_pct = count_balance_response_pct.transpose()
bal = pd.DataFrame(newdf2['balance_group'].value_counts())
bal['% Contacted'] = bal['balance_group']*100/bal['balance_group'].sum()
bal['% Term Deposit'] = count_balance_response_pct['yes']
bal.drop('balance_group',axis = 1,inplace = True)
# FIX: derive the display order from the group label itself instead of the
# hard-coded position list [1,2,0,3], which assumed a particular
# value_counts() ordering.
bal['bal'] = bal.index.map({'no balance': 0, 'low balance': 1, 'average balance': 2, 'high balance': 3})
bal = bal.sort_values('bal',ascending = True)
plot_balance = bal[['% Term Deposit','% Contacted']].plot(kind = 'bar',
                                                          color = ('green','red'),
                                                          figsize = (8,6))
plt.title('Subscription vs Contact Rate by Balance Level')
plt.ylabel('Subscription Rate')
plt.xlabel('Balance Category')
plt.xticks(rotation = 'horizontal')
# label the bar
for rec, label in zip(plot_balance.patches, bal['% Term Deposit'].round(1).astype(str)):
    plot_balance.text(rec.get_x() + rec.get_width()/2, rec.get_height() + 1, label+'%', ha = 'center', color = 'black')
print(count_balance_response_pct)
#ax = sns.boxplot(newdf2['Target'], newdf2['balance'], data=newdf2)
Quick Insights For FD subscription rate against balance level:
To identify the trend more clearly, we have categorized customers into four groups based on their levels of balance:
No Balance: clients with a negative balance. Low Balance: clients with a balance between 0 and 1000 euros Average Balance: clients with a balance between 1000 and 5000 euros. High Balance: clients with a balance greater than 5000 euros.
Our bar plot shows a positive correlation between clients' balance levels and FD subscription rate. Clients with negative balances returned an FD subscription rate of only 6.8%, while clients with average or high balances had significantly higher FD subscription rates, nearing 15%.
We have seen above that more than 50% of the customers contacted had a low balance level, so the bank should take care in future to target customers with a higher balance level. People with negative balances are less likely to subscribe to an FD.
# Subscription rate (% 'yes') within each job category, as a horizontal bar chart.
count_job_target_pct = pd.crosstab(newdf2['Target'],newdf2['job']).apply(lambda x: x/x.sum() * 100)
count_job_target_pct = count_job_target_pct.transpose()
plot_job = count_job_target_pct['yes'].sort_values(ascending = True).plot(kind ='barh',figsize = (12,6))
plt.title('Term Deposit Rate by Job')
plt.xlabel('Term Deposit Subscription Rate')
plt.ylabel('Job Category')
# Annotate each bar with its percentage.
for rec, label in zip(plot_job.patches, count_job_target_pct['yes'].sort_values(ascending = True).round(1).astype(str)):
    plot_job.text(rec.get_width()+0.8, rec.get_y()+ rec.get_height()-0.5, label+'%', ha = 'center')
Quick insights: term deposit rate by job
# Subscription rate (% 'yes') within each marital-status category.
count_marital_target_pct = pd.crosstab(newdf2['Target'],newdf2['marital']).apply(lambda x: x/x.sum() * 100)
count_marital_target_pct = count_marital_target_pct.transpose()
plot_marital = count_marital_target_pct['yes'].sort_values(ascending = True).plot(kind ='barh',figsize = (12,6))
plt.title('Term Deposit Rate by Marital Status')
plt.xlabel('Term Deposit Rate')
plt.ylabel('Marital Status')
# Label each bar
for rec, label in zip(plot_marital.patches, count_marital_target_pct['yes'].sort_values(ascending = True).round(1).astype(str)):
    plot_marital.text(rec.get_width()+0.8, rec.get_y()+ rec.get_height()-0.5, label+'%', ha = 'center')
Quick insights: marital status impact on term deposit subscription:
# Subscription rate (% 'yes') within each education level.
count_education_target_pct = pd.crosstab(newdf2['Target'],newdf2['education']).apply(lambda x: x/x.sum() * 100)
count_education_target_pct= count_education_target_pct.transpose()
plot_education = count_education_target_pct['yes'].sort_values(ascending = True).plot(kind ='barh',figsize = (12,6))
plt.title('Term Deposit Rate by Education Level')
plt.xlabel('Term Deposit Rate')
# BUG FIX: user-facing axis label had a typo ('Education Leve;').
plt.ylabel('Education Level')
# Label each bar
for rec, label in zip(plot_education.patches, count_education_target_pct['yes'].sort_values(ascending = True).round(1).astype(str)):
    plot_education.text(rec.get_width()+0.8, rec.get_y()+ rec.get_height()-0.5, label+'%', ha = 'center')
Quick Insights: people education level distribution who opted for Term Deposit
# Subscription rate (% 'yes') by personal-loan status.
count_loan_target_pct = pd.crosstab(newdf2['Target'],newdf2['loan']).apply(lambda x: x/x.sum() * 100)
count_loan_target_pct= count_loan_target_pct.transpose()
plot_loan = count_loan_target_pct['yes'].sort_values(ascending = True).plot(kind ='barh',figsize = (12,6))
plt.title('Term Deposit Rate by Personal Loan Status')
plt.xlabel('Term Deposit Rate')
# BUG FIX: user-facing axis label had a typo ('Personal Laon Status').
plt.ylabel('Personal Loan Status')
# Label each bar
for rec, label in zip(plot_loan.patches, count_loan_target_pct['yes'].sort_values(ascending = True).round(1).astype(str)):
    plot_loan.text(rec.get_width()+0.8, rec.get_y()+ rec.get_height()-0.5, label+'%', ha = 'center')
Quick insights on Term Deposit Based On Personal Loan Status Of The Customers:
- People with a personal loan seem less interested in FD; only 6.7% of them availed the FD option.
- As expected, people with no loan are more likely to avail the FD option; around 12.6% of customers with no personal loan opted for FD.
# Subscription rate (% 'yes') by credit-default status.
count_creditdefault_target_pct = pd.crosstab(newdf2['Target'],newdf2['default']).apply(lambda x: x/x.sum() * 100)
count_creditdefault_target_pct= count_creditdefault_target_pct.transpose()
plot_credit_default = count_creditdefault_target_pct['yes'].sort_values(ascending = True).plot(kind ='barh',figsize = (12,6))
plt.title('Term Deposit Rate by Credit Default Status')
plt.xlabel('Term Deposit Rate')
plt.ylabel('Credit Default Status')
# Label each bar
for rec, label in zip(plot_credit_default.patches, count_creditdefault_target_pct['yes'].sort_values(ascending = True).round(1).astype(str)):
    plot_credit_default.text(rec.get_width()+0.8, rec.get_y()+ rec.get_height()-0.5, label+'%', ha = 'center')
Quick insights on how credit default status is related to term deposit:
- We can see that people with no credit default have a higher term deposit subscription rate, as expected; they account for around 12%.
- People with a credit default history account for a 6.4% term deposit rate.
# Subscription rate (% 'yes') by housing-loan status.
# NOTE(review): this cell uses 'dataframe' (pre-outlier data) while the sibling
# rate plots use newdf2 -- confirm which dataset is intended.
count_housingloan_target_pct = pd.crosstab(dataframe['Target'],dataframe['housing']).apply(lambda x: x/x.sum() * 100)
count_housingloan_target_pct= count_housingloan_target_pct.transpose()
plot_housing_loan = count_housingloan_target_pct['yes'].sort_values(ascending = True).plot(kind ='barh',figsize = (12,6))
plt.title('Term Deposit Rate by Housing Loan Status')
plt.xlabel('Term Deposit Rate')
plt.ylabel('Housing Loan Status')
# Label each bar
for rec, label in zip(plot_housing_loan.patches, count_housingloan_target_pct['yes'].sort_values(ascending = True).round(1).astype(str)):
    plot_housing_loan.text(rec.get_width()+0.8, rec.get_y()+ rec.get_height()-0.5, label+'%', ha = 'center')
Quick Insights:
- Customers with a housing loan are less likely to avail a term deposit; only 8% of customers who already have a housing loan availed the FD option. This is as expected.
- Around 17% of the customers with no housing loan opted for a term deposit.
# Subscription rate (% 'yes') by mode of contact.
# NOTE(review): uses 'dataframe' rather than newdf2 like the sibling plots --
# confirm which dataset is intended.
count_modeofcomm_target_pct = pd.crosstab(dataframe['Target'],dataframe['contact']).apply(lambda x: x/x.sum() * 100)
count_modeofcomm_target_pct= count_modeofcomm_target_pct.transpose()
plot_comm_mode = count_modeofcomm_target_pct['yes'].sort_values(ascending = True).plot(kind ='barh',figsize = (12,6))
plt.title('Term Deposit Rate by Mode Of Contact')
plt.xlabel('Term Deposit Subscription Rate')
plt.ylabel('Mode Of Customer Contact')
# Label each bar
for rec, label in zip(plot_comm_mode.patches, count_modeofcomm_target_pct['yes'].sort_values(ascending = True).round(1).astype(str)):
    plot_comm_mode.text(rec.get_width()+0.8, rec.get_y()+ rec.get_height()-0.5, label+'%', ha = 'center')
Quick Insight:
- Of the customers who opted for a term deposit, 15% were converted using a cell phone as the mode of contact, which seems normal and expected.
- Telephone (landline) is quite close to mobile and accounts for 13.4% of term deposit conversions.
# Call duration distribution split by subscription outcome.
plt.figure(figsize=(30,15))
# BUG FIX: the positional vectors come from newdf2 but data= pointed at the
# unfiltered 'dataframe'; use the same frame consistently.
ax = sns.boxplot(newdf2['Target'], newdf2['duration'], data=newdf2)
plt.tight_layout()
Quick Insight:
- If the call duration was less than 3-5 minutes customers were less likely to subscribe to Term Deposit
- But If the call duration lasted between 5-10 minutes , customers were more likely to take Term Deposit
# Subscription rate (% 'yes') by month of last contact.
# NOTE(review): uses 'dataframe' rather than newdf2 like most sibling plots --
# confirm which dataset is intended.
count_monthofcontact_target_pct = pd.crosstab(dataframe['Target'],dataframe['month']).apply(lambda x: x/x.sum() * 100)
count_monthofcontact_target_pct= count_monthofcontact_target_pct.transpose()
print(count_monthofcontact_target_pct)
plot_mnth_contact = count_monthofcontact_target_pct['yes'].sort_values(ascending = True).plot(kind ='barh',figsize = (12,6))
plt.title('Term Deposit Rate by Month Of Contact')
plt.xlabel('Term Deposit Rate')
plt.ylabel('Month Of Last Customer Contact')
# Label each bar
for rec, label in zip(plot_mnth_contact.patches, count_monthofcontact_target_pct['yes'].sort_values(ascending = True).round(1).astype(str)):
    plot_mnth_contact.text(rec.get_width()+0.8, rec.get_y()+ rec.get_height()-0.5, label+'%', ha = 'center')
Quick insight on the impact of contact month on term deposit:
- The highest term deposit subscription rate occurred in March (over 50%), and the subscription rates in September, October and December are all over 40%. Clearly, this gives the bank some insight into when to plan its campaign. That said, external factors such as weather should also be considered while planning the marketing campaign.
# Balance distribution by marital status (x numeric, y categorical).
ax = sns.boxplot(newdf2['balance'], newdf2['marital'], data=newdf2)
Quick Insights:
- As seen from the box plot above, married people have better financial status compared to single and divorced people.
# Balance distribution by education level.
ax = sns.boxplot(newdf2['balance'], newdf2['education'], data=newdf2)
Quick Insight:
- The level of education has a significant impact on the amount of balance a prospect has. As shown above, people with higher education have a higher bank balance compared to those with primary-level education.
- The unknown category, whose information is not clear, seems to have a higher bank balance than primary; bank executives should follow up to find the actual education level.
- There is one interesting observation for people with secondary-level education: they seem to have a low balance, which may be due to expenses they need to plan for higher education.
- People with a higher education level also include more people with a negative bank balance, with some outliers.
# Balance distribution by personal-loan status.
ax = sns.boxplot(newdf2['loan'], newdf2['balance'], data=newdf2)
Insights:
- It is clear that people with an existing personal loan seem to be struggling with their bank balance, as expected.
# Balance distribution by housing-loan status.
ax = sns.boxplot(newdf2['housing'], newdf2['balance'], data=newdf2)
Quick Insights:
# Balance distribution by credit-default status.
ax = sns.boxplot(newdf2['default'], newdf2['balance'], data=newdf2)
Quick Insight:
# Scatter of call duration vs number of calls, coloured by response, with a
# reference line at 6 calls.
campaign_call_duration = sns.lmplot(x='duration', y='campaign',data = newdf2,hue = 'Target',fit_reg = False, scatter_kws={'alpha':0.6}, height =7)
plt.axis([0,65,0,65])
plt.ylabel('Number of Calls')
plt.xlabel('Duration of Calls (Minutes)')
plt.title('The Relationship between the Number and Duration of Calls (with Response Result)')
# Annotation
plt.axhline(y=6, linewidth=2, color="k", linestyle='--')
plt.annotate('Higher subscription rate when calls <6',xytext = (35,13),arrowprops=dict(color = 'k', width=1),xy=(30,6))
plt.show()
# ax.set_xticklabels(df["default"].unique(), rotation=45, rotation_mode="anchor")
# plt.style.use('dark_background')
# Balance by credit-default status, split by subscription outcome.
fig = plt.figure(figsize=(20,20))
ax = fig.add_subplot(221)
ax = sns.boxplot(newdf2['default'], newdf2['balance'], hue = "Target", data=newdf2)
# Impact of education level and balance on FD subscription.
# (Original comment said "job type" -- the plot below is by education.)
fig = plt.figure(figsize=(40,30))
ax1 = fig.add_subplot(221)
ax1 = sns.boxplot(newdf2['education'], newdf2['balance'], hue = "Target", data=newdf2)
Quick Insights:
# Impact of job type and balance on FD subscription.
# (Original comment said "education" -- the plot below is by job.)
fig = plt.figure(figsize=(40,30))
ax = fig.add_subplot(221)
ax = sns.boxplot(newdf2['job'], newdf2['balance'], hue = "Target", data=newdf2)
Quick Insights:
- Customers who are retired seem to have a higher bank balance and a higher FD subscription rate.
- Management professionals seem to have fewer negative bank balances compared to retired customers, and also have the second-highest bank balance and FD subscription rate.
- Students don't seem to have negative bank balances, and the same is the case with housemaids, who have very few negative balances.
# Impact of marital status and balance on FD subscription.
fig = plt.figure(figsize=(40,30))
ax = fig.add_subplot(221)
ax = sns.boxplot(newdf2['marital'], newdf2['balance'], hue = "Target", data=newdf2)
Insights:
# Impact of personal-loan status and balance on FD subscription.
fig = plt.figure(figsize=(40,30))
ax = fig.add_subplot(221)
ax = sns.boxplot(newdf2['loan'], newdf2['balance'], hue = "Target", data=newdf2)
Insights: customers with no personal loan seem to have a higher bank balance and a higher FD subscription rate.
# Impact of housing-loan status and balance on FD subscription.
fig = plt.figure(figsize=(40,30))
ax = fig.add_subplot(221)
ax = sns.boxplot(newdf2['housing'], newdf2['balance'], hue = "Target", data=newdf2)
Insights: customers with no housing loan seem to have a higher bank balance and so are more likely to opt for an FD.
# Impact of age group and balance on FD subscription.
# (Original comment said "contact type" -- the plot below is by age_group.)
fig = plt.figure(figsize=(40,30))
ax = fig.add_subplot(221)
ax = sns.boxplot(newdf2['age_group'], newdf2['balance'], hue = "Target", data=newdf2)
# Correlation matrix for all the numerical attributes.
newdf2.corr()
# Map month names to numbers for easier analysis.
# FIX: the original ran 12 separate .loc assignments inside a pointless
# one-element list loop; a single .map is equivalent. Unmatched month values
# become NaN (same as before), and .astype(float) keeps the column float64,
# matching the dtype the .loc assignments produced.
month_map = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
             "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12}
newdf2["month_int"] = newdf2["month"].map(month_map).astype(float)
def convert(newdf2, new_column, old_column):
    """Binary-encode *old_column* into *new_column* (0 for 'no', 1 for
    anything else) and return the value counts of the new column."""
    encoded = newdf2[old_column].apply(lambda value: 1 if value != 'no' else 0)
    newdf2[new_column] = encoded
    return newdf2[new_column].value_counts()
# Build a frame for the correlation matrix.
# NOTE(review): 'Target' is still the raw 'yes'/'no' string column here; older
# pandas silently drops non-numeric columns in .corr(), newer pandas raises
# unless numeric_only=True is passed — confirm the pandas version in use.
corr_data = newdf2[['age','balance','day','duration','campaign','pdays','month_int', 'previous','Target']]
corr = corr_data.corr()
print("Correlation Matrix")
print(corr)
# Heatmap of the correlation matrix with annotated cell values.
cor_plot = sns.heatmap(corr,annot=True,cmap='RdYlGn',linewidths=0.2,annot_kws={'size':10})
fig=plt.gcf()
fig.set_size_inches(20,10)
plt.xticks(fontsize=10,rotation=-30)
plt.yticks(fontsize=10)
plt.title('Correlation Matrix')
plt.show()
# Pairwise scatter plots of all features, coloured by the target class.
sns.pairplot(newdf2,hue='Target');
plt.figure(figsize = (15,7))
plt.title('Correlation of Data Columns', y=1.05, size=19)
sns.heatmap(newdf2.corr(), annot=True, fmt='.2f',cmap="YlGnBu")
## Contingency table to get Approved %
def get_contingency_table(df,target,var):
    """Cross-tabulate *var* against *target* (with margins) and append the
    percentage of positive (label 1) outcomes per category as 'Approved (%)'.
    The 'All' margin column is dropped from the returned table."""
    table = pd.crosstab(df[var], df[target], margins=True)
    table['Approved (%)'] = (table[1] / table['All'] * 100).round(2)
    return table.drop(columns=['All'])
# Structure and transposed summary statistics of the working dataframe.
newdf2.info()
newdf2.describe().T
# Remove rows with missing values.
# NOTE: the dataset has no nulls, so this dropna() is a no-op kept for safety.
newdf2 = newdf2.dropna()
# Inspect attributes holding placeholder values ('unknown', 'other') that add
# no information for modelling.
newdf2.head()
There are attributes with placeholder values such as "unknown" and "other", which are as uninformative as missing values. These ambiguous values are therefore removed from the dataset.
- poutcome has many 'other'/'unknown' values that carry no meaning, so those rows can be removed.
- Let's identify and treat them below.
# Step 1: Delete the rows in column 'poutcome' where it contains 'other'.
condition = newdf2.poutcome == 'other'
newdf2.drop(newdf2[condition].index, axis = 0, inplace = True)
newdf2.describe()
# Print the distinct values of every categorical (object-dtype) column.
for col in newdf2.select_dtypes(include='object').columns:
    print(col)
    print(newdf2[col].unique())
As the output of the unique() calls shows, the job and education columns contain 'unknown' values. We replace them with the more meaningful value 'other'.
Let's treat these attributes below:
# Replace the placeholder 'unknown' with 'other' in job and education.
newdf2[['job','education']] = newdf2[['job','education']].replace(['unknown'],'other')
# Verify the dataframe has been updated.
# FIX: the original printed the bound method object (newdf2['education'].count
# without parentheses); call it to print the actual row count.
print("\n\nAfter Treatment", newdf2['education'].count())
newdf2['contact'].value_counts()
Insights:
- About 29% of the 'contact' values are 'unknown', so the column plays no significant role in predicting the model outcome.
- 'contact' therefore seems to be an unimportant feature, and we can drop it from the dataframe.
# Drop column "contact" which seems to be not so useful.
new_df1 = newdf2.copy()  # keep a backup copy before dropping
newdf2.drop('contact', axis=1, inplace = True)
new_df2 = newdf2.copy()
# Drop customers whose education is 'other' (formerly 'unknown'): the value
# carries no predictive information.
logic = (new_df2['education'] == 'other')
new_df2.drop(new_df2[logic].index, axis = 0, inplace = True)
new_df2.info()
# Function to replace marital values with numerical codes
def marital_num(df):
    """Add a numeric 'marital_state' column to *df* in place
    (married=1, single=2, divorced=3); other values are left as NaN."""
    codes = {"married": 1, "single": 2, "divorced": 3}
    for label, code in codes.items():
        df.loc[df['marital'] == label, 'marital_state'] = code
#JOB:
def job_num(df):
    """Add a numeric 'Job_state' column to *df* in place, encoding each job
    category with a fixed integer code; unlisted values are left as NaN."""
    codes = {
        "management": 1,
        "technician": 2,
        "entrepreneur": 3,
        "blue-collar": 4,
        "retired": 5,
        "admin.": 6,
        "services": 7,
        "self-employed": 8,
        "unemployed": 9,
        "student": 10,
        "housemaid": 11,
        "other": 12,
    }
    for label, code in codes.items():
        df.loc[df['job'] == label, 'Job_state'] = code
#Education:
def edu_num(df):
    """Add a numeric 'education_state' column to *df* in place
    (primary=1, secondary=2, tertiary=3, unknown=4)."""
    codes = {"primary": 1, "secondary": 2, "tertiary": 3, "unknown": 4}
    for label, code in codes.items():
        df.loc[df['education'] == label, 'education_state'] = code
# Previous-campaign outcome:
def pout_num(df):
    """Add a numeric 'poutcome_state' column to *df* in place
    (failure=1, success=2, unknown=3)."""
    codes = {"failure": 1, "success": 2, "unknown": 3}
    for label, code in codes.items():
        df.loc[df['poutcome'] == label, 'poutcome_state'] = code
# Apply the numeric encoders to the cleaned dataframe (all mutate in place).
marital_num(new_df2)
job_num(new_df2)
edu_num(new_df2)
pout_num(new_df2)
# Binary-encode the yes/no columns (0 = 'no', 1 = otherwise).
convert(new_df2, "housing_state", "housing")
convert(new_df2, "default_state", "default")
convert(new_df2, "loan_binary", "loan")
convert(new_df2, "Term Deposit Result", "Target")
# Drop the original raw/categorical columns now that encoded versions exist.
# NOTE(review): 'housing' appears twice in this list; 'balance_group' and
# 'age_group' are presumably created upstream of this chunk — confirm.
new_df2.drop(['age','job', 'balance_group','housing','marital', 'default', 'loan', 'housing', 'education', 'month', 'poutcome', 'Target'], axis = 1, inplace = True)
new_df2
new_df2.info()
new_df2['Term Deposit Result'].value_counts()
The target column is highly imbalanced: 'no' accounts for about 88% of the records and 'yes' for only 12%. Such imbalance distorts model accuracy and prediction; we will address it later with resampling.
from sklearn.metrics import confusion_matrix,classification_report,f1_score, precision_score, recall_score, roc_curve, auc, average_precision_score, roc_auc_score, accuracy_score, precision_recall_curve, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
#Fitting the model
def fit_test_model(model, X_train, y_train, X_test):
    """Fit *model* on the training data, then return its predictions on X_test."""
    model.fit(X_train, y_train)
    return model.predict(X_test)
# Function to calculate Accuracy Score
def model_accuracy_score(model, X_train, y_train, X_test, y_true=None):
    """Fit *model*, predict on X_test and return the accuracy score.

    FIX: the original silently read the notebook-global ``y_test`` even though
    it was not a parameter. ``y_true`` now lets callers pass the true labels
    explicitly; when omitted it falls back to the global for backward
    compatibility.
    """
    if y_true is None:
        y_true = y_test  # legacy behaviour: use the notebook-global labels
    model_pred = fit_test_model(model, X_train, y_train, X_test)
    return accuracy_score(y_true, model_pred)
# Calculate the confusion matrix and plot it as a heatmap.
def draw_confmatrix(y_test, yhat, str1, str2):
    """Print and heatmap-plot the confusion matrix for binary labels [0, 1].

    str1/str2 are display names for the negative/positive class ticks.
    FIX: `labels` is keyword-only in modern scikit-learn; the original passed
    [0, 1] positionally, which raises a TypeError on scikit-learn >= 1.1.
    """
    cm = confusion_matrix(y_test, yhat, labels=[0, 1])
    print("Confusion Matrix Is:", cm)
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=[str1, str2], yticklabels=[str1, str2])
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
#Print Classification Report Metrics
def classificationreport(y_test, yhat):
    """Return sklearn's text classification report for these predictions."""
    return classification_report(y_test, yhat)
# Function to plot the ROC curve for a fitted model.
def roc_auc_curve(model, X_test, TITLE, y_true=None):
    """Plot the ROC curve of *model* on X_test, annotated with the AUC score.

    Requires a fitted model exposing predict_proba.
    FIX: the original read the notebook-global ``y_test``; ``y_true`` makes
    the labels an explicit, backward-compatible optional parameter.
    """
    if y_true is None:
        y_true = y_test  # legacy behaviour: notebook-global labels
    # Probability of the positive class.
    probs = model.predict_proba(X_test)[:, 1]
    rocauc = roc_auc_score(y_true, probs)
    fpr, tpr, thresholds = roc_curve(y_true, probs)
    plt.figure(figsize=(10, 10))
    plt.title(TITLE)
    plt.plot(fpr, tpr, color='red', label='AUC = %0.2f' % rocauc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.axis('tight')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
def prec_recall_curve(model, X_train, y_train, X_test, STR, y_true=None):
    """Plot the precision-recall curve and print f1 / PR-AUC / average precision.

    FIX 1: the original re-fitted the model (via fit_test_model) *after*
    already calling predict_proba — the model must already be fitted at that
    point, so the refit was redundant work; predict directly instead.
    X_train/y_train stay in the signature for backward compatibility.
    FIX 2: true labels come from the optional ``y_true`` parameter, falling
    back to the notebook-global ``y_test`` as before.
    """
    if y_true is None:
        y_true = y_test  # legacy behaviour: notebook-global labels
    # Positive-class probabilities and hard class predictions.
    probs = model.predict_proba(X_test)[:, 1]
    yhat = model.predict(X_test)
    precision, recall, thresholds = precision_recall_curve(y_true, probs)
    f1 = f1_score(y_true, yhat)
    aucscore = auc(recall, precision)
    ap = average_precision_score(y_true, probs)
    print('f1=%.3f auc=%.3f ap=%.3f' % (f1, aucscore, ap))
    # No-skill baseline, then the model's precision-recall curve.
    plt.figure(figsize=(10, 10))
    plt.title(STR)
    plt.plot([0, 1], [0.5, 0.5], linestyle='--')
    plt.plot(recall, precision, marker='.')
    plt.show()
from sklearn.model_selection import train_test_split
# Split features/target by position.
# NOTE(review): this assumes 'Term Deposit Result' is the 16th (index 15)
# column of new_df2 — confirm against new_df2.columns before reuse.
array = new_df2.values
X = array[:,0:15]
y = array[:,15]
# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 15)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
from sklearn.preprocessing import StandardScaler
# Standardise features: fit on the training split only, then apply the same
# transform to the test split (avoids test-set leakage).
stdsc= StandardScaler()
X_train3 = pd.DataFrame(stdsc.fit_transform(X_train))
X_test3 = pd.DataFrame(stdsc.transform(X_test))
StdSc_X_train = X_train3
StdSc_X_test = X_test3
# Sanity check: no NaNs introduced by scaling.
StdSc_X_train.isnull().sum()
#Build the logistic regression model
import statsmodels.api as sm
logit = sm.Logit(y_train, sm.add_constant(X_train3))
lg = logit.fit()
#Summary of logistic regression
from scipy import stats
# Monkey-patch for old statsmodels builds that still call the removed
# scipy.stats.chisqprob; chi2.sf is the modern equivalent.
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
print(lg.summary())
print(lg.summary2())
#Calculate Odds Ratio, probability
## Collate odds ratio, probability and p-value for each coefficient.
lgcoef = pd.DataFrame(lg.params, columns=['coef'])
lgcoef.loc[:, "Odds_ratio"] = np.exp(lgcoef.coef)
lgcoef['probability'] = lgcoef['Odds_ratio']/(1+lgcoef['Odds_ratio'])
lgcoef['pval']=lg.pvalues
pd.options.display.float_format = '{:.2f}'.format
# Filter by significant p-value (pval <= 0.05) and sort descending by odds ratio.
lgcoef = lgcoef.sort_values(by="Odds_ratio", ascending=False)
pval_filter = lgcoef['pval']<=0.05
lgcoef[pval_filter]
# NOTE(review): duplicate of the earlier draw_confmatrix definition; kept to
# preserve the notebook's cell structure.
def draw_confmatrix(y_test, yhat, str1, str2):
    """Print and heatmap-plot the confusion matrix for binary labels [0, 1].

    FIX: pass `labels` by keyword — it is keyword-only in scikit-learn >= 1.1.
    """
    cm = confusion_matrix(y_test, yhat, labels=[0, 1])
    print("Confusion Matrix Is:", cm)
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=[str1, str2], yticklabels=[str1, str2])
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): CountVectorizer is imported but never used in this notebook.
from sklearn.feature_extraction.text import CountVectorizer
# Baseline classifiers to compare.
lr = LogisticRegression()
dtree = DecisionTreeClassifier(criterion="entropy", max_depth=4)
rfc = RandomForestClassifier(n_estimators=40)
def scorer(i, j, l):
    """Fit each classifier on the scaled training data and report accuracy,
    F1, the classification report and a confusion-matrix heatmap.

    Relies on the notebook globals StdSc_X_train/StdSc_X_test/y_train/y_test.
    FIX: the original printed the return value of draw_confmatrix (always
    None, so "None" appeared after each plot); print the header and then
    call it instead.
    """
    for every in (i, j, l):
        every.fit(StdSc_X_train, y_train)
        yhat = every.predict(StdSc_X_test)
        print("Accuracy Score Is : ", accuracy_score(y_test, yhat))
        print(every.__class__.__name__, 'F1 score =', f1_score(y_test, yhat))
        print(every.__class__.__name__, 'classification Score =', '\n', classification_report(y_test, yhat))
        print("Confusion Matrix HeatMap : ")
        draw_confmatrix(y_test, yhat, "NO FD", "YES FD")
scorer(lr, dtree, rfc)
Positive/Negative: type of class (label) ["No", "Yes"]. True/False: whether the model classified the case correctly or incorrectly.
True Negatives: correctly classified "No" cases — clients who do not subscribe to a term deposit and were predicted not to.
False Negatives: cases incorrectly predicted as "No" — clients who actually subscribed to a term deposit but were missed by the model.
False Positives: cases incorrectly predicted as "Yes" — clients who did not subscribe but were flagged by the model as likely to.
True Positives: correctly classified "Yes" cases — clients who subscribe to a term deposit and were predicted to.
Using the data collected from existing customers, build a model that will help the marketing team identify potential customers who are relatively more likely to subscribe term deposit and thus increase their hit ratio.
Precision: the number of true positives divided by the total number of elements labelled as belonging to the positive class.
Recall: the number of true positives divided by the total number of elements that actually belong to the positive class.
In our given objective, the marketing team wants to maximise the hit ratio, so precision and recall on the "Yes" class are the key metrics.
With these concepts, let's compare our metrics and find the best model based on the confusion matrix.
# Fit the two baseline models on the scaled training data.
DT = DecisionTreeClassifier(criterion="entropy", max_depth=4)
DT.fit(StdSc_X_train, y_train)
LogReg = LogisticRegression()
LogReg.fit(StdSc_X_train, y_train)
#roc_auc_curve()
# NOTE: roc_auc_curve/prec_recall_curve return None, so these print() calls
# emit "None" after each plot.
print(roc_auc_curve(LogReg, StdSc_X_test, "Logistic Regression ROC"))
print(roc_auc_curve(DT, StdSc_X_test,"Decision Tree ROC"))
#roc_auc_curve()
#prec_recall_curve(model,X_train, y_train, X_test, STR)
print(prec_recall_curve(LogReg,StdSc_X_train, y_train, StdSc_X_test, "Logistic Regression Precision-Recall Curve"))
print(prec_recall_curve(DT, StdSc_X_train, y_train, StdSc_X_test,"Decision Tree ROC Precision-Recall Curve"))
# NOTE(review): k_fold is defined but never used below — TODO wire it into
# cross_val_score or remove it.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
# Random forest with a larger ensemble for comparison.
rfc = RandomForestClassifier(criterion='gini', n_estimators = 1000)
rfc.fit(StdSc_X_train, y_train)
rfcpred = rfc.predict(StdSc_X_test)
draw_confmatrix(y_test, rfcpred,"No FD", "Yes FD")
print("RFC Accuracy Score:",round(accuracy_score(y_test, rfcpred),2)*100)
print("RFC F1 Score ",f1_score(y_test, rfcpred))
print(classificationreport(y_test,rfcpred))
Based on the F1 score, accuracy and recall, the random forest seems to perform better.
Let's further analyse how the RFC fares on the ROC-AUC curve/score and the precision-recall curve.
# ROC and precision-recall curves for the random forest (the print() calls
# show "None" since both helpers return None).
print(roc_auc_curve(rfc, StdSc_X_test, "Random Forrestor ROC Curve"))
print(prec_recall_curve(rfc,StdSc_X_train, y_train, StdSc_X_test, "Random Forrestor ROC Curve Precision-Recall Curve"))
Quick Insights from the RFC ROC & precision-recall curves:
- The random forest performed much better in terms of area under the curve, with an AUC score of 92%, which beats the decision tree and the other individual classifiers compared above.
from sklearn.ensemble import AdaBoostClassifier
#abcl = AdaBoostClassifier(base_estimator=dt_model, n_estimators=50)
# AdaBoost with 1000 estimators (default base learner).
ada = AdaBoostClassifier( n_estimators= 1000)
ada = ada.fit(StdSc_X_train, y_train)
ada_pred = ada.predict(StdSc_X_test)
draw_confmatrix(y_test, ada_pred,"No FD", "Yes FD")
print("Adaboost Accuracy Score:",round(accuracy_score(y_test, ada_pred),2)*100)
print("Adaboost F1 Score ",f1_score(y_test, ada_pred))
print(classificationreport(y_test,ada_pred))
print(confusion_matrix(y_test, ada_pred ))
print(round(accuracy_score(y_test, ada_pred),2)*100)
print(roc_auc_curve(ada, StdSc_X_test, "Ada Boost ROC Curve"))
print(prec_recall_curve(ada, StdSc_X_train, y_train, StdSc_X_test, "AdaBoost Precision-Recall Curve"))
from sklearn.ensemble import GradientBoostingClassifier
#abcl = AdaBoostClassifier(base_estimator=dt_model, n_estimators=50)
# Gradient boosting with 1000 boosting stages.
Gfc = GradientBoostingClassifier( n_estimators= 1000)
Gfc = Gfc.fit(StdSc_X_train, y_train)
gfc_pred = Gfc.predict(StdSc_X_test)
draw_confmatrix(y_test, gfc_pred,"No FD", "Yes FD")
print("GBFC Accuracy Score:",round(accuracy_score(y_test, gfc_pred),2)*100)
print("GBFC F1 Score ",f1_score(y_test, gfc_pred))
print(classificationreport(y_test,gfc_pred))
print(confusion_matrix(y_test, gfc_pred ))
print(round(accuracy_score(y_test, gfc_pred),2)*100)
print(roc_auc_curve(Gfc, StdSc_X_test, "Gradient Boost ROC Curve"))
print(prec_recall_curve(Gfc,StdSc_X_train, y_train, StdSc_X_test, "Gradientboost Precision-Recall Curve"))
# Logistic regression row for the model-comparison table (resultsDf).
clf_pruned = LogisticRegression()
clf_pruned = clf_pruned.fit(StdSc_X_train, y_train)
pred_RF= clf_pruned.predict(StdSc_X_test)
#f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
tempResultsDf = pd.DataFrame({'Method':['LogisticRegression'],
                              'accuracy':[accuracy_score(y_test, pred_RF)],
                              'f1':[f1_score(y_test,pred_RF)],
                              'precision_score':[precision_score(y_test,pred_RF)],
                              'recall_score':[recall_score(y_test,pred_RF)],
                              'roc_auc_score':[roc_auc_score(y_test,pred_RF)]})
resultsDf = pd.concat([tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy','f1','precision_score', 'recall_score', 'roc_auc_score']]
resultsDf
# Pruned decision tree row for the comparison table.
clf_pruned = DecisionTreeClassifier(criterion = "entropy", random_state = 100,
                                    max_depth=3, min_samples_leaf=5)
clf_pruned = clf_pruned.fit(StdSc_X_train, y_train)
pred_RF= clf_pruned.predict(StdSc_X_test)
#f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
tempResultsDf = pd.DataFrame({'Method':['DecisionTreeClassifier'],
                              'accuracy':[accuracy_score(y_test, pred_RF)],
                              'f1':[f1_score(y_test,pred_RF)],
                              'precision_score':[precision_score(y_test,pred_RF)],
                              'recall_score':[recall_score(y_test,pred_RF)],
                              'roc_auc_score':[roc_auc_score(y_test,pred_RF)]})
resultsDf = pd.concat([resultsDf,tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy','f1','precision_score', 'recall_score', 'roc_auc_score']]
resultsDf
new_df2.head()
## Calculating feature importance
# Feature names in the same column order as the positional X matrix
# (target column dropped).
xvar = new_df2.drop('Term Deposit Result', axis=1)
feature_cols = xvar.columns
# NOTE(review): feat_importance is computed but never used; the dict below
# uses the normalised clf_pruned.feature_importances_ instead.
feat_importance = clf_pruned.tree_.compute_feature_importances(normalize=False)
feat_imp_dict = dict(zip(feature_cols, clf_pruned.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False)
# Random forest (10 trees) row for the comparison table.
clf_pruned = RandomForestClassifier(n_estimators = 10, random_state=15)
clf_pruned = clf_pruned.fit(StdSc_X_train, y_train)
pred_RF = clf_pruned.predict(StdSc_X_test)
# FIX: the original scored `rfcpred` (predictions of the earlier 1000-tree
# forest) under this row's label, which made the 10-tree fit above pointless;
# score this model's own predictions instead.
tempResultsDf = pd.DataFrame({'Method':['RandomForestClassifier'],
                              'accuracy':[accuracy_score(y_test, pred_RF)],
                              'f1':[f1_score(y_test, pred_RF)],
                              'precision_score':[precision_score(y_test, pred_RF)],
                              'recall_score':[recall_score(y_test, pred_RF)],
                              'roc_auc_score':[roc_auc_score(y_test, pred_RF)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy','f1','precision_score', 'recall_score', 'roc_auc_score']]
resultsDf
# AdaBoost row, reusing the predictions computed earlier (ada_pred).
tempResultsDf = pd.DataFrame({'Method':['AdaBoostClassifier'],
                              'accuracy':[accuracy_score(y_test, ada_pred)],
                              'f1':[f1_score(y_test,ada_pred)],
                              'precision_score':[precision_score(y_test,ada_pred)],
                              'recall_score':[recall_score(y_test,ada_pred)],
                              'roc_auc_score':[roc_auc_score(y_test,ada_pred)]})
resultsDf = pd.concat([resultsDf,tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy','f1','precision_score', 'recall_score', 'roc_auc_score']]
resultsDf
from yellowbrick.classifier import ClassificationReport, ROCAUC
# Visualize model performance with yellowbrick library
# NOTE(review): these visualizers are fit on the *unscaled* X_train, unlike
# the other models which use StdSc_X_train — confirm this is intended.
viz = ClassificationReport(AdaBoostClassifier(n_estimators= 100, learning_rate=0.1, random_state=22))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()
roc = ROCAUC(AdaBoostClassifier(n_estimators= 100, learning_rate=0.1, random_state=22))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show()
from sklearn.ensemble import BaggingClassifier
# Bagging ensemble with out-of-bag scoring enabled.
bag = BaggingClassifier(n_estimators=50, max_samples= .7, bootstrap=True, oob_score=True, random_state=22)
clf_pruned = bag.fit(StdSc_X_train, y_train)
pred_RF= clf_pruned.predict(StdSc_X_test)
#f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
tempResultsDf = pd.DataFrame({'Method':['BaggingClassifier'],
                              'accuracy':[accuracy_score(y_test, pred_RF)],
                              'f1':[f1_score(y_test,pred_RF)],
                              'precision_score':[precision_score(y_test,pred_RF)],
                              'recall_score':[recall_score(y_test,pred_RF)],
                              'roc_auc_score':[roc_auc_score(y_test,pred_RF)]})
resultsDf = pd.concat([resultsDf,tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy','f1','precision_score', 'recall_score', 'roc_auc_score']]
resultsDf
# Gradient boosting row, reusing the predictions computed earlier (gfc_pred).
tempResultsDf = pd.DataFrame({'Method':['GradientBoostingClassifier'],
                              'accuracy':[accuracy_score(y_test, gfc_pred)],
                              'f1':[f1_score(y_test,gfc_pred)],
                              'precision_score':[precision_score(y_test,gfc_pred)],
                              'recall_score':[recall_score(y_test,gfc_pred)],
                              'roc_auc_score':[roc_auc_score(y_test,gfc_pred)]})
resultsDf = pd.concat([resultsDf,tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy','f1','precision_score', 'recall_score', 'roc_auc_score']]
resultsDf
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the random forest.
# NOTE(review): max_features='auto' was removed in scikit-learn 1.3 — replace
# it with 'sqrt' (its long-time classifier equivalent) on modern versions.
params_dict={'n_estimators':[5,10,50],'max_features':['auto','sqrt','log2']}
clf_rf = GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=22),param_grid=params_dict,scoring='accuracy',cv=10)
clf_rf.fit(StdSc_X_train,y_train)
pred = clf_rf.predict(StdSc_X_test)
# NOTE(review): acc_GS / acc_f1 are computed but never used below.
acc_GS = accuracy_score(pred,y_test)
acc_f1=f1_score(pred,y_test)
#f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
tempResultsDf = pd.DataFrame({'Method':['GridSearchCV'],
                              'accuracy':[accuracy_score(y_test,pred)],
                              'f1':[f1_score(y_test,pred)],
                              'precision_score':[precision_score(y_test,pred)],
                              'recall_score':[recall_score(y_test,pred)],
                              'roc_auc_score':[roc_auc_score(y_test,pred)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
# Highlight the GradientBoostingClassifier row in light green in the display.
resultsDf.reset_index(drop=True).style.apply(lambda x: ['background: lightgreen'
                                                        if (x.Method == 'GradientBoostingClassifier')
                                                        else '' for i in x], axis=1)
!pip install imblearn
!pip install imblearn==0.0
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_SMOTE, y_SMOTE = sm.fit_sample(StdSc_X_train, y_train)
pd.Series(y_SMOTE).value_counts()
sc = StandardScaler()
sc.fit(X_SMOTE)
X_train_std = sc.transform(X_SMOTE)
X_test_std = sc.transform(StdSc_X_test)
def scorer_bal(i,j):
    """Fit each model on the SMOTE-balanced training data, then report
    accuracy, F1, the classification report and a confusion-matrix heatmap
    on the original (imbalanced) test split."""
    for clf in (i, j):
        clf.fit(X_train_std, y_SMOTE)
        predictions = clf.predict(X_test_std)
        name = clf.__class__.__name__
        print("Accuracy Score Is : ", accuracy_score(y_test, predictions))
        print(name, 'F1 score =', f1_score(y_test, predictions))
        print(name, 'classification Score =', '\n', classification_report(y_test, predictions))
        draw_confmatrix(y_test, predictions, "NO FD", "YES FD")
scorer_bal(lr,dtree)
# NOTE(review): exact duplicate of the scorer_bal defined above; re-executed
# here only to score the ensemble models on the balanced data.
def scorer_bal(i,j):
    """Fit each model on the SMOTE-balanced training data and print its
    test-set metrics plus a confusion-matrix heatmap."""
    for model in (i,j):
        model.fit(X_train_std,y_SMOTE)
        yhat= model.predict(X_test_std)
        print("Accuracy Score Is : ", accuracy_score(y_test, yhat))
        print(model.__class__.__name__, 'F1 score =', f1_score(y_test, yhat))
        print(model.__class__.__name__, 'classification Score =','\n', classification_report(y_test, yhat))
        draw_confmatrix(y_test, yhat, "NO FD", "YES FD")
scorer_bal(rfc,Gfc)